In [1]:
import json
DATA_PATH = 'cleaned_data.json'
all_data = json.load(open(DATA_PATH))
In [2]:
import pandas as pd
import numpy as np
answers = pd.DataFrame(all_data)
answers['duration'] = answers.submit_time - answers.srv_time
answers['user_id'] = answers.user_id.astype(np.int64)  # np.int was removed from NumPy
# Clamp response time to 2 minutes.
answers.loc[answers.duration > 120, 'duration'] = 120
# Keep only users who answered more than 10 questions.
num_answered = answers.user_id.value_counts()
completions = num_answered[num_answered > 10].index
answers = answers[answers.user_id.isin(completions)]
# Some of the survey experiments did not get many responses; filter them out.
source_counts = answers['type'].value_counts()
In [3]:
# So like 3 people completed surveys from Twitter :)...
source_counts
Out[3]:
In [4]:
answers[0:3]
Out[4]:
In [5]:
answers = answers[answers.type.isin(source_counts[source_counts > 100].index)]
In [6]:
grouped = answers.groupby(['type', 'question_id'])['duration'].agg(['mean', 'count', 'std'])
grouped
Out[6]:
In [7]:
questions = json.load(open('../app/survey.json'))
text_by_id = {q['id']: q['question'] for q in questions['questions']}
# Mark answers that were left at the question's pre-filled prompt as 'DEFAULT'
# so they are easy to filter later.
for q in questions['questions']:
    if 'prompt' not in q:
        continue
    answers.loc[(answers.question_id == q['id']) &
                (answers.answer == q['prompt']), 'answer'] = 'DEFAULT'
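A quick way to sanity-check that loop is to count, per question, the answers now marked DEFAULT:

answers[answers.answer == 'DEFAULT'].question_id.value_counts()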
In [8]:
print(answers['type'].unique())
answers['type'].value_counts()
Out[8]:
I'm just going to work on finding the timing differences between the 5-cent Turk answers and the "Do it fast" Turk answers.
In [9]:
%matplotlib inline
import matplotlib.pyplot as plt

def plot_single_question(question_id, survey_answers, answer_types,
                         normalize=False, ax=None):
    if ax is None:
        ax = plt.gca()
    durations = [survey_answers[(survey_answers.type == t) &
                                (survey_answers.question_id == question_id)].duration
                 for t in answer_types]
    if normalize:
        # Weight each sample by 1/n so every group's histogram sums to 1.
        weights = [np.ones_like(d) / len(d) for d in durations]
    else:
        weights = None
    ax.hist(durations, label=list(answer_types), weights=weights)
    ax.legend(bbox_to_anchor=(1.7, .95))
    ax.set_title(text_by_id[question_id][:80] + ' (question {})'.format(question_id))

def plot_all_questions(survey_answers, answer_types, question_ids, normalize=False):
    size = 5
    plt.figure(figsize=(size, len(question_ids) * size))
    for (i, q) in enumerate(question_ids):
        ax = plt.subplot(len(question_ids), 1, i + 1)
        plot_single_question(q, survey_answers, answer_types, normalize=normalize, ax=ax)
In [10]:
plot_all_questions(answers[answers.duration < 50],
                   ['Turk, asking for Fast', 'Mechanical Turk 5 Cents'],
                   list(range(1, 12)),
                   normalize=True)
In [11]:
from scipy.stats import gaussian_kde
(fraudy, legit) = ('Turk, asking for Fast', 'Mechanical Turk 5 Cents')
# Fit a KDE of answer time per question for each cohort; these act as the
# likelihood models for fraud-like and legit behavior below.
fraudy_timings = {}
legit_timings = {}
for q_id in range(1, 12):
    q_mask = answers.question_id == q_id
    fraudy_timings[q_id] = gaussian_kde(answers.duration[q_mask & (answers.type == fraudy)])
    legit_timings[q_id] = gaussian_kde(answers.duration[q_mask & (answers.type == legit)])
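Each KDE is a smoothed density over answer times, so evaluating it at a few durations gives the likelihoods used below (the actual values depend on the data):

fraudy_timings[1].evaluate([2.0, 10.0, 60.0])  # density at 2s, 10s, 60s for question 1
legit_timings[1].evaluate([2.0, 10.0, 60.0])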
In [12]:
SIZE = 5
NUM_QUESTIONS = len(legit_timings)
plt.figure(figsize=(SIZE, SIZE * NUM_QUESTIONS))
for i in sorted(fraudy_timings):
    plt.subplot(NUM_QUESTIONS, 1, i)
    plt.title(text_by_id[i][:80] + ' (question {})'.format(i))
    fraud_kde = fraudy_timings[i]
    legit_kde = legit_timings[i]
    x = np.arange(0, 120, .1)
    plt.yticks([])
    plt.xlabel('Seconds to answer')
    plt.plot(x, fraud_kde.evaluate(x), 'r', label='more fraudulent')
    plt.plot(x, legit_kde.evaluate(x), 'g', label='more legitimate')
    plt.legend()
Here's the start of digging into some of the actual answer data, just for fun.
In [13]:
import re
political_regex = re.compile('.*(obama|jfk|kennedy|ronald|reagan|regan|clinton|bill cl|'
                             'george washington|george w|dukakis|saddam|'
                             'bush|carter|nixon|modi|gorbachev|lincoln|trudeau|'
                             'brezhnev|perot|'
                             'mahatma gandhi|nehru|gingrich|martin luther king|mlk|'
                             'rajiv gandhi|ford|rajive gandhi|eisenhower|'
                             'rahul gandhi|indira gandhi|nelson mandela|white house|'
                             'gandhi|thatcher).*', re.IGNORECASE)
# Modern pandas .str.match returns booleans; .str.findall returns the captured
# group matches ([] when nothing matches), close to the old str.match behavior.
answers['figure'] = answers[answers.question_id == 3]['answer'].str.findall(political_regex)
def get_first(l):
    if isinstance(l, list) and l:
        return l[0].lower()
answers['figure_clean'] = answers['figure'].apply(get_first)
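As a quick check of what findall captures (a made-up answer, not from the data):

political_regex.findall('I remember when Obama was first elected')
# -> ['Obama']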
In [14]:
answers.figure_clean.value_counts()
Out[14]:
In [15]:
# Someone famous in India who I had no clue about:
answers.answer[answers.figure_clean == 'modi']
Out[15]:
In [16]:
# Find the folks not captured by that regex:
def no_match(x):
    return isinstance(x, list) and not x
list(answers[answers['figure'].apply(no_match) & (answers.answer != 'DEFAULT')].answer)
Out[16]:
In [17]:
def score_for_user(user_id, initial_fraud_probability=.1):
    fraud_probability = initial_fraud_probability
    nonfraud_probability = 1 - initial_fraud_probability
    the_data = answers[answers.user_id == user_id].sort_values('question_id')[[
        'question_id', 'answer', 'duration']]
    partial_results = []
    for r in the_data.iterrows():
        (question_id, answer, duration) = r[1]
        # Multiply in the likelihood of this answer time under each model,
        # then renormalize so the two probabilities sum to 1.
        fraud_likelihood = fraudy_timings[question_id].evaluate(duration)[0]
        fraud_probability *= fraud_likelihood
        nonfraud_likelihood = legit_timings[question_id].evaluate(duration)[0]
        nonfraud_probability *= nonfraud_likelihood
        normalizer = nonfraud_probability + fraud_probability
        fraud_probability /= normalizer
        nonfraud_probability /= normalizer
        partial_results.append({'question_id': question_id,
                                'duration': duration,
                                'answer': answer,
                                'fraud_p': fraud_probability,
                                'nonfraud_p': nonfraud_probability})
    return partial_results
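Each loop iteration is just Bayes' rule with the two KDEs as likelihood models. With $P_0$ the initial fraud probability (0.1 above), $f_i$ and $g_i$ the fraud and legit densities for question $i$, and $d_i$ the observed duration, the final score is

$$P(\text{fraud} \mid d_1, \ldots, d_n) = \frac{P_0 \prod_i f_i(d_i)}{P_0 \prod_i f_i(d_i) + (1 - P_0) \prod_i g_i(d_i)}.$$

Renormalizing after every question, as the loop does, gives the same number while keeping the running products from underflowing.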
In [18]:
score_for_user(87397087779)
Out[18]:
In [19]:
answers.duration[answers.duration < 10].round(0).value_counts()
Out[19]:
In [20]:
all_scores = []
for u in answers.user_id.unique():
    all_scores.append(score_for_user(u))
plt.hist([d[-1]['fraud_p'] for d in all_scores])
plt.xlabel('Final probability of fraud')
plt.ylabel('Number of people')
Out[20]:
In [21]:
# Looking at some examples.
from pprint import pprint
import random
def get_instance(collection, lower, upper, function):
    g = [c for c in collection if lower <= function(c) <= upper]
    if g:
        return random.choice(g)
get_fraud_p = lambda c: c[-1]['fraud_p']
print('Good surveys')
short_questions = {1: "First name",
                   2: "People with your name honest?",
                   3: "Earliest political memory?",
                   4: "Men or women need more exercise?",
                   5: "What country do you live in?",
                   6: "Allocating money to different departments",
                   7: "How sad would you be if various plants went away?",
                   8: "What animal would you not want to leave with a sheep?",
                   9: "10 kids, 1 evil kid, 0 kids, or 2 bad kids?",
                   10: "Do you have any idea what the word 'Telluride' means?",
                   11: "Who would your parents like?"}
def to_table(scored_answers):
    rows = ['<tr><td>{}</td><td>{}</td><td>{:0.3f}</td><td>{:0.5f}</td></tr>'
            .format(short_questions[r['question_id']], r['answer'], r['duration'], r['fraud_p'])
            for r in scored_answers]
    return ('<table>\n'
            '<tr><th>Question</th><th>Answer</th><th>Duration</th><th>Fraud probability</th></tr>\n'
            '{}\n'
            '</table>'.format('\n'.join(rows)))
print('Good table\n', to_table(get_instance(all_scores, .0, .1, get_fraud_p)))
print('Bad table\n', to_table(get_instance(all_scores, .9, 1, get_fraud_p)))
In [22]:
pdf_data = {'range': [0, 120],
            'step_size': .25}
pdf_values = {}
x = np.arange(pdf_data['range'][0], pdf_data['range'][1], pdf_data['step_size'])
for i in range(1, 12):
    fraud_kde = fraudy_timings[i]
    legit_kde = legit_timings[i]
    pdf_values[i] = {'legit': list(legit_kde.evaluate(x)),
                     'fraudy': list(fraud_kde.evaluate(x))}
pdf_data['values'] = pdf_values
# Export the sampled PDFs as JSON for use outside this notebook.
with open('fraud_model_pdf.json', 'wt') as f:
    json.dump(pdf_data, f)
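A consumer can map a duration back to the two likelihoods with a nearest-bin lookup into that grid. A minimal sketch (the helper name and the clamping at the grid edges are my own choices, not part of the app):

import json

def lookup_likelihoods(question_id, duration, path='fraud_model_pdf.json'):
    # Hypothetical consumer of the export above: nearest-bin PDF lookup.
    with open(path) as f:
        model = json.load(f)
    values = model['values'][str(question_id)]  # JSON object keys are strings
    index = int((duration - model['range'][0]) / model['step_size'])
    index = max(0, min(index, len(values['legit']) - 1))  # clamp to the grid
    return values['fraudy'][index], values['legit'][index]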
In [ ]: